Budding
planted Jan 8, 2026 · tended Jan 8, 2026
#ai-agents #memory #vector-databases #context-management
Agent Memory Systems
🌿 Budding note: memory architectures for intelligent agents.
Why Memory Matters
Agents need memory to:
- Maintain context across conversations
- Learn from experience and improve over time
- Recall past interactions for personalization
- Avoid repeating mistakes
- Build knowledge incrementally
Related: AI Agents Fundamentals for core concepts
Types of Memory
1. Short-Term Memory (Working Memory)
Purpose: Immediate conversation context
```python
import time

class ShortTermMemory:
    """Sliding window of recent messages."""

    def __init__(self, max_messages: int = 10):
        self.messages = []
        self.max_messages = max_messages

    def add(self, role: str, content: str):
        """Add a message to memory."""
        self.messages.append({
            "role": role,
            "content": content,
            "timestamp": time.time()
        })
        # Keep only the most recent messages
        if len(self.messages) > self.max_messages:
            self.messages.pop(0)

    def get_context(self) -> list[dict]:
        """Get messages in the shape expected by the LLM API."""
        return [
            {"role": msg["role"], "content": msg["content"]}
            for msg in self.messages
        ]

    def clear(self):
        """Clear working memory."""
        self.messages = []
```
Usage:
```python
memory = ShortTermMemory(max_messages=10)
memory.add("user", "What's the weather in Tokyo?")
memory.add("assistant", "It's 18°C and partly cloudy in Tokyo")
memory.add("user", "What about Paris?")

# Get context for the next LLM call
messages = memory.get_context()
```
2. Long-Term Memory (Vector Store)
Purpose: Persistent knowledge retrieval
```python
import time
import hashlib

from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams, PointStruct
from sentence_transformers import SentenceTransformer

class LongTermMemory:
    """Vector database for semantic search."""

    def __init__(self, collection_name: str = "memories"):
        self.client = QdrantClient(":memory:")  # Or a remote URL
        self.encoder = SentenceTransformer('all-MiniLM-L6-v2')
        self.collection = collection_name

        # Create the collection
        self.client.create_collection(
            collection_name=self.collection,
            vectors_config=VectorParams(
                size=384,  # Embedding size of all-MiniLM-L6-v2
                distance=Distance.COSINE
            )
        )

    def store(self, text: str, metadata: dict = None):
        """Store a memory with its semantic embedding."""
        vector = self.encoder.encode(text).tolist()
        point = PointStruct(
            # Stable content hash, so re-storing the same text upserts one point
            id=int(hashlib.sha256(text.encode()).hexdigest()[:8], 16),
            vector=vector,
            payload={
                "text": text,
                "timestamp": time.time(),
                **(metadata or {})
            }
        )
        self.client.upsert(
            collection_name=self.collection,
            points=[point]
        )

    def recall(self, query: str, limit: int = 5) -> list[dict]:
        """Retrieve the most relevant memories."""
        query_vector = self.encoder.encode(query).tolist()
        results = self.client.search(
            collection_name=self.collection,
            query_vector=query_vector,
            limit=limit
        )
        return [
            {
                "id": hit.id,
                "text": hit.payload["text"],
                "score": hit.score,
                "timestamp": hit.payload["timestamp"],
                **{k: v for k, v in hit.payload.items()
                   if k not in ["text", "timestamp"]}
            }
            for hit in results
        ]

    def forget(self, query: str, threshold: float = 0.9):
        """Remove memories that closely match the query."""
        results = self.recall(query, limit=10)
        ids_to_delete = [
            hit["id"] for hit in results
            if hit["score"] > threshold
        ]
        if ids_to_delete:
            self.client.delete(
                collection_name=self.collection,
                points_selector=ids_to_delete
            )
```
Usage:
```python
ltm = LongTermMemory()

# Store experiences
ltm.store("User prefers technical explanations", {"category": "preference"})
ltm.store("Previous bug: SQL injection in login form", {"category": "bug"})
ltm.store("Successfully deployed to production on 2026-01-05", {"category": "milestone"})

# Recall relevant memories
memories = ltm.recall("How should I explain this?", limit=3)
# Returns: [{"text": "User prefers technical explanations", ...}]
```
3. Episodic Memory
Purpose: Remember specific events and conversations
```python
import uuid
from dataclasses import dataclass
from datetime import datetime
from typing import List, Optional

@dataclass
class Episode:
    """A single conversation episode."""
    id: str
    timestamp: datetime
    messages: List[dict]
    summary: str
    outcome: str  # "success", "failure", "partial"
    tags: List[str]

class EpisodicMemory:
    """Store and retrieve conversation episodes."""

    def __init__(self):
        self.episodes: List[Episode] = []
        self.vector_store = LongTermMemory(collection_name="episodes")

    def store_episode(
        self,
        messages: List[dict],
        summary: str,
        outcome: str,
        tags: List[str] = None
    ):
        """Save a completed episode."""
        episode = Episode(
            id=str(uuid.uuid4()),
            timestamp=datetime.now(),
            messages=messages,
            summary=summary,
            outcome=outcome,
            tags=tags or []
        )
        self.episodes.append(episode)

        # Also index the summary in the vector DB for semantic search
        self.vector_store.store(
            text=summary,
            metadata={
                "episode_id": episode.id,
                "outcome": outcome,
                "tags": episode.tags
            }
        )

    def recall_similar_episodes(self, query: str, limit: int = 3) -> List[Episode]:
        """Find similar past episodes."""
        memories = self.vector_store.recall(query, limit=limit)
        episodes = [
            self._get_episode_by_id(mem["episode_id"])
            for mem in memories
        ]
        return [ep for ep in episodes if ep is not None]

    def _get_episode_by_id(self, episode_id: str) -> Optional[Episode]:
        """Retrieve the full episode by its ID."""
        return next(
            (ep for ep in self.episodes if ep.id == episode_id),
            None
        )

    def get_success_rate(self, tag: str = None) -> float:
        """Calculate the success rate, optionally filtered by tag."""
        episodes = self.episodes
        if tag:
            episodes = [ep for ep in episodes if tag in ep.tags]
        if not episodes:
            return 0.0
        successful = sum(1 for ep in episodes if ep.outcome == "success")
        return successful / len(episodes)
```
Usage:
```python
episodic = EpisodicMemory()

# After completing a task
episodic.store_episode(
    messages=conversation_history,
    summary="User asked for weather in Tokyo, successfully fetched and responded",
    outcome="success",
    tags=["weather", "api_call", "tokyo"]
)

# Learn from past experiences
similar = episodic.recall_similar_episodes("How to fetch weather data?")
# Returns episodes about weather queries

# Check performance
success_rate = episodic.get_success_rate(tag="api_call")
print(f"API call success rate: {success_rate:.1%}")
```
Memory Integration in Agents
Hierarchical Memory Architecture
```python
class AgentWithMemory:
    """Agent with integrated memory systems."""

    def __init__(self, llm):
        self.llm = llm  # Any async LLM client exposing generate()
        self.short_term = ShortTermMemory(max_messages=10)
        self.long_term = LongTermMemory()
        self.episodic = EpisodicMemory()

    async def process(self, user_message: str) -> str:
        """Process a message with memory."""
        # 1. Add to short-term memory
        self.short_term.add("user", user_message)

        # 2. Recall relevant long-term memories
        relevant_memories = self.long_term.recall(user_message, limit=3)

        # 3. Find similar past episodes
        similar_episodes = self.episodic.recall_similar_episodes(user_message, limit=2)

        # 4. Build enriched context
        context = self._build_context(
            short_term=self.short_term.get_context(),
            long_term=relevant_memories,
            episodes=similar_episodes
        )

        # 5. Generate response
        response = await self.llm.generate(context)

        # 6. Update memories
        self.short_term.add("assistant", response)
        self.long_term.store(f"Q: {user_message}\nA: {response}")

        return response

    def _build_context(self, short_term, long_term, episodes) -> str:
        """Combine memory types into a single prompt."""
        context = "# Current Conversation\n"
        for msg in short_term:
            context += f"{msg['role']}: {msg['content']}\n"

        if long_term:
            context += "\n# Relevant Past Information\n"
            for mem in long_term:
                context += f"- {mem['text']}\n"

        if episodes:
            context += "\n# Similar Past Interactions\n"
            for ep in episodes:
                context += f"- {ep.summary} (outcome: {ep.outcome})\n"

        return context
```
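Roughly how this might be wired up; `my_llm` stands in here for whatever async LLM client the agent wraps (any object with an async `generate()` method):

```python
agent = AgentWithMemory(llm=my_llm)

# Each call enriches the prompt with working memory, recalled facts, and past episodes
reply = await agent.process("What's the weather in Tokyo?")

# Once the task wraps up, archive the conversation as an episode
agent.episodic.store_episode(
    messages=agent.short_term.get_context(),
    summary="Weather lookup for Tokyo",
    outcome="success",
    tags=["weather"]
)
```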
Memory Optimization
1. Summarization for Context Window
```python
class SummarizingMemory:
    """Compress old messages to fit the context window."""

    def __init__(self, llm, max_tokens: int = 4000):
        self.llm = llm
        self.max_tokens = max_tokens
        self.messages = []
        self.summary = ""

    async def add(self, role: str, content: str):
        """Add a message, summarizing if needed."""
        self.messages.append({"role": role, "content": content})

        # Check token count
        current_tokens = self._estimate_tokens()
        if current_tokens > self.max_tokens:
            await self._summarize_and_compress()

    async def _summarize_and_compress(self):
        """Summarize old messages."""
        # Keep the last 5 messages, summarize the rest
        to_summarize = self.messages[:-5]
        recent = self.messages[-5:]

        if to_summarize:
            summary_text = "\n".join(
                f"{msg['role']}: {msg['content']}"
                for msg in to_summarize
            )
            new_summary = await self.llm.generate(
                f"Summarize this conversation:\n{summary_text}"
            )
            self.summary += f"\n{new_summary}"
            self.messages = recent

    def _estimate_tokens(self) -> int:
        """Rough token estimation."""
        total_chars = sum(len(msg["content"]) for msg in self.messages)
        total_chars += len(self.summary)
        return total_chars // 4  # ~4 chars per token

    def get_context(self) -> list[dict]:
        """Get the compressed context."""
        context = []
        if self.summary:
            context.append({
                "role": "system",
                "content": f"Previous conversation summary:\n{self.summary}"
            })
        context.extend(self.messages)
        return context
```
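A quick sketch of how this behaves, again assuming `my_llm` exposes an async `generate()`:

```python
mem = SummarizingMemory(llm=my_llm, max_tokens=4000)

await mem.add("user", "Walk me through the deployment steps again.")
await mem.add("assistant", "1. Build the image, 2. push to the registry, 3. roll out.")

# Older turns get folded into the running summary once the budget is exceeded,
# so the LLM sees a system summary plus only the most recent messages.
context = mem.get_context()
```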
2. Tiered Storage
```python
import uuid

class TieredMemory:
    """Hot/warm/cold memory tiers."""

    def __init__(self):
        self.hot = []                 # In-memory, instant access
        self.warm = LongTermMemory()  # Vector DB, fast
        self.cold = {}                # S3/disk, slow but cheap

    def store(self, memory: dict, tier: str = "hot"):
        """Store in the appropriate tier."""
        if tier == "hot":
            self.hot.append(memory)
        elif tier == "warm":
            self.warm.store(memory["text"], memory.get("metadata"))
        elif tier == "cold":
            memory_id = str(uuid.uuid4())
            self.cold[memory_id] = memory
            # In production: upload to S3
            # boto3.client('s3').put_object(...)

    def recall(self, query: str) -> list:
        """Search across tiers."""
        # Check the hot tier first
        hot_results = [m for m in self.hot if query.lower() in m["text"].lower()]

        # Then the warm tier
        warm_results = self.warm.recall(query, limit=5)

        # Combine results
        return hot_results + warm_results

    def promote_to_hot(self, memory_id: str):
        """Move frequently accessed memories to the hot tier."""
        if memory_id in self.cold:
            memory = self.cold.pop(memory_id)
            self.hot.append(memory)
```
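A sketch of how the tiers might divide up an agent's memories:

```python
tiers = TieredMemory()

# Facts needed right now stay in the hot tier
tiers.store({"text": "Current task: migrate the billing service"}, tier="hot")

# Durable knowledge goes to the warm vector store
tiers.store({"text": "The payments team owns the billing service",
             "metadata": {"category": "ownership"}}, tier="warm")

# Rarely needed history is archived in the cold tier
tiers.store({"text": "2025 Q3 retrospective notes"}, tier="cold")

results = tiers.recall("billing service")
```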
3. Selective Retention
```python
class SelectiveMemory:
    """Only remember important information."""

    def __init__(self, llm):
        self.llm = llm
        self.memory = LongTermMemory()

    async def should_remember(self, interaction: str) -> bool:
        """Decide whether an interaction is worth remembering."""
        prompt = f"""Rate the importance of remembering this interaction (0-10):

{interaction}

Consider:
- Does it contain facts the user taught me?
- Does it reveal user preferences?
- Is it relevant for future interactions?
- Is it just small talk?

Respond with just a number 0-10."""
        score = await self.llm.generate(prompt)
        return int(score.strip()) >= 7

    async def store_if_important(self, interaction: str, metadata: dict = None):
        """Selectively store memories."""
        if await self.should_remember(interaction):
            self.memory.store(interaction, metadata)
            return True
        return False
```
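In practice it might look like this (again with `my_llm` as the assumed client):

```python
selective = SelectiveMemory(llm=my_llm)

# Likely rated important: a durable user preference gets stored
await selective.store_if_important(
    "User said all timestamps should be shown in UTC",
    {"category": "preference"}
)

# Likely rated unimportant: small talk is dropped
await selective.store_if_important("User said good morning")
```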
Memory Challenges
1. Context Window Limits
Problem: LLMs have finite context windows.
Solution: Summarization + selective retrieval.
```python
# Instead of the full history (might exceed the context window)
messages = all_messages

# Use a compressed context: a few relevant memories plus the recent turns
relevant = memory.recall(current_query, limit=5)
recent = messages[-10:]          # last 10 messages
context = relevant + recent      # fits in the window
```
2. Memory Consistency
Problem: Conflicting information in memory.
Solution: Versioning and conflict resolution.
```python
import time

class VersionedMemory:
    """Track conflicting values for the same key as versions."""

    def __init__(self):
        self.memory = {}

    def store(self, key: str, value: str):
        """Store a value with version tracking."""
        entry = self.memory.setdefault(key, {"current": value, "versions": []})
        entry["versions"].append({
            "value": value,
            "timestamp": time.time()
        })
        entry["current"] = value

    def get_latest(self, key: str) -> str:
        """Get the most recent version."""
        if key in self.memory:
            return self.memory[key]["versions"][-1]["value"]
        return None
```
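A small example of how conflicting writes resolve:

```python
vm = VersionedMemory()

vm.store("user_timezone", "Europe/Berlin")
vm.store("user_timezone", "Asia/Tokyo")   # user moved; the new value supersedes the old

vm.get_latest("user_timezone")            # "Asia/Tokyo"
# Earlier versions stay in the history for auditing or conflict resolution
```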
3. Privacy and Forgetting
Problem: Users want their data deleted.
Solution: Support the right to be forgotten.
```python
from qdrant_client import models

class PrivacyAwareMemory:
    def forget_user_data(self, user_id: str):
        """Delete all memories for a given user."""
        # Delete from the vector store, filtering on a user_id payload field
        self.vector_store.client.delete(
            collection_name=self.collection,
            points_selector=models.FilterSelector(
                filter=models.Filter(
                    must=[
                        models.FieldCondition(
                            key="user_id",
                            match=models.MatchValue(value=user_id)
                        )
                    ]
                )
            )
        )
        # Delete episodes belonging to this user (assumes episodes carry a user_id)
        self.episodes = [
            ep for ep in self.episodes
            if getattr(ep, "user_id", None) != user_id
        ]
```
Related: Agent Security Considerations
Production Patterns
1. Distributed Memory
```python
import json
import redis

class DistributedMemory:
    """Share memory across agent instances."""

    def __init__(self, redis_url: str):
        self.redis = redis.from_url(redis_url)

    def store(self, key: str, value: dict, ttl: int = 3600):
        """Store with an expiration time."""
        self.redis.setex(
            key,
            ttl,
            json.dumps(value)
        )

    def recall(self, key: str) -> dict:
        """Retrieve from shared memory."""
        value = self.redis.get(key)
        return json.loads(value) if value else None
```
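For example, pointed at a local Redis (the URL and keys here are just placeholders):

```python
shared = DistributedMemory(redis_url="redis://localhost:6379/0")

# One agent instance writes...
shared.store("session:42:user_prefs", {"tone": "technical"}, ttl=3600)

# ...and any other instance can read it back until the TTL expires
prefs = shared.recall("session:42:user_prefs")
```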
2. Async Memory Operations
```python
import asyncio

class AsyncMemory:
    """Non-blocking memory operations."""

    def store_async(self, text: str, metadata: dict):
        """Schedule storage without blocking the agent loop."""
        # Fire-and-forget: the task runs in the background
        asyncio.create_task(self._background_store(text, metadata))

    async def _background_store(self, text: str, metadata: dict):
        """Background storage task (assumes async encoder / vector store wrappers)."""
        vector = await self.encoder.encode_async(text)
        await self.vector_store.upsert_async(vector, metadata)
```
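A rough sketch of use inside an async agent loop, assuming the encoder and vector store are attached elsewhere:

```python
memory = AsyncMemory()

# Schedule the write and keep going; the agent never blocks on storage
memory.store_async("User confirmed the migration plan", {"category": "decision"})
```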
Related: Production Agent Deployment
Connection Points
Prerequisites:
- AI Agents Fundamentals – Agent basics
- Tool Use and Function Calling – Memory as a tool
Related:
- Claude Agent Patterns – Claude-specific memory handling
- Agent Security Considerations – Memory security
- Building Agents with LangChain – LangChain memory modules
Advanced:
- Production Agent Deployment – Scaling memory systems
- Agent Evaluation & Testing – Testing memory recall